import plotly.offline as pyo
# Set notebook mode to work in offline
pyo.init_notebook_mode()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import pprint
import numpy as np
#plt.rcParams['figure.figsize'] = [20, 15]
As of Jan 1, 2022, we can find 11 AirBnb data sets on http://insideairbnb.com/get-the-data.html, which were scrapped one time per month between Dec 2020 and Nov 2021, with May 2021 skipped. The 11 datasets where downloaded and put in 11 folders.
Get two reference data:
We choose data of confirmed cases in NSW instead of Australia based on the assumption that the Covid situation in NSW has stronger impact on NSW AirBnb business than the Covid situations outside of NSW.
We choose data of Google Trends queried by users in Australia instead of NSW based on the assumption that NSW AirBnb guests are usually from all over Australia instead of NSW local residents. Opinions on Covid of people all over Australia may influence their decision on whether to book a NSW AirBnb property or not.
The numbers in Google Trends data indicate how the users' "interest" in term Covid changed during the given period. Roughly speaking, the highest frequency of search happening at a timepoint within the period is normalized to 100, and the frequencies happening at rest of the timepoints are normalzied to numbers <= 100, by comparing the frequencies against the highest.
# As of Jan 1, 2022, we can find 11 AirBnb data sets on http://insideairbnb.com/get-the-data.html,
# which were scrapped one time per month between Dec 2020 and Nov 2021, with May 2021 skipped.
# The 11 datasets where downloaded and put in 11 folders. The foler names are shown below.
AIRBNB_FOLDERS = ['data_airbnb_syd/20201214/',
'data_airbnb_syd/20210110/',
'data_airbnb_syd/20210208/',
'data_airbnb_syd/20210304/',
'data_airbnb_syd/20210410/',
'data_airbnb_syd/20210605/',
'data_airbnb_syd/20210705/',
'data_airbnb_syd/20210807/',
'data_airbnb_syd/20210908/',
'data_airbnb_syd/20211009/',
'data_airbnb_syd/20211106/']
# The file of confirmed Covid19 cases in NSW was downladed, and stored at location below
NSW_COVID_CASE_FILE = './data_covid/nsw_health/confirmed_cases_table1_location.csv'
# The file of Google Trends of query term 'Covid' was downloaded, and stored at location below
GOOGLE_TRENDS_COVID_FILE = './data_covid/google_trends/multiTimeline.csv'
In this use case we estimate the number of AirBnb properties in operation and their occupancy rate in the period between Dec, 2020 and Nov, 2021. The data were collected by 11 scrapings, one scraping executed in each month, with May 2021 being skipped. One scraping usually lasted for 1-2 day.
We treat the status of properties at the time they were scraped as a snapshot, and therefore as a sample of their status in usual days. We estimate the number of properties in operation by counting the listed properties at scraping time, and whether the properties were occupied/vacant by checking if they were available for booking at scraping time.
The second estimation is based on a simplistic assumption that "available" means "vacant", and "unavailable" means "occupied". In reality, a property can be unavailable for two reasons, namely it is booked/occupied by guests, or the host of the property blocks booking. We assume the second reason for unavailable is not common, and can be safely ignored in the analysis.
We observe changes of property number and occapancy rate from the dataset. In pursuing the causes of the changes, we use two reference data, New South Wales confirmed Covid case data and Google Trends data on query term Covid.
import os
available_property_nubers = []
unavailable_property_numbers = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
last_scrape_date = pd.read_csv(os.path.join(folder, 'listings_detailed.csv'),
header='infer',
usecols=['id', 'calendar_last_scraped'],
parse_dates=['calendar_last_scraped'])
last_scrape_date = last_scrape_date.rename(columns={'id': 'listing_id',
'calendar_last_scraped': 'date'})
df_cal_temp = pd.read_csv(os.path.join(folder, 'calendar_detailed.csv'),
header='infer',
usecols=['listing_id', 'available', 'date'],
parse_dates=['date']
)
df_cal_on_scrape_day = pd.merge(df_cal_temp, last_scrape_date, how='inner', on=['listing_id', 'date'])
available_property_nubers.append((date_label, (df_cal_on_scrape_day['available']=='t').sum()))
unavailable_property_numbers.append((date_label, (df_cal_on_scrape_day['available']=='f').sum()))
df_property_availability = pd.DataFrame({'date': [d for d, _ in available_property_nubers],
'available': [a for _, a in available_property_nubers],
'unavailable': [u for _, u in unavailable_property_numbers]})
df_property_availability['date'] = pd.to_datetime(df_property_availability['date'])
df_property_availability['property_number'] = df_property_availability['available'] + df_property_availability['unavailable']
df_property_availability['unavailable_ratio'] = df_property_availability['unavailable'] / df_property_availability['property_number']
df_property_availability['available_ratio'] = df_property_availability['available'] / df_property_availability['property_number']
df_nsw_cases = pd.read_csv(NSW_COVID_CASE_FILE, header='infer', parse_dates=['notification_date'])
df_nsw_cases_count = df_nsw_cases[['notification_date']]
df_nsw_cases_count = df_nsw_cases_count.groupby('notification_date').size().reset_index(name='counts')
df_nsw_cases_count = df_nsw_cases_count.loc[(df_nsw_cases_count['notification_date'] >= '2020-11-01') & (df_nsw_cases_count['notification_date'] <= '2021-11-30')]
df_trends = pd.read_csv(GOOGLE_TRENDS_COVID_FILE, header=1, parse_dates=['Week'])
#df_trends['covid: (Australia)'] = df_trends['covid: (Australia)'].str.replace(r'<1', '0')
df_trends['covid: (Australia)'] = df_trends['covid: (Australia)'].apply(pd.to_numeric) # latest version of plotly has a bug, needing explicitly changing object to numeric
df_trends = df_trends.loc[(df_trends['Week'] >= '2020-11-1') & (df_trends['Week'] <= '2021-11-30')]
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Create figure with secondary y-axis
fig = make_subplots(rows=2,
cols=1,
specs=[[{"secondary_y": True}],
[{"secondary_y": True}]
],
shared_xaxes=True,
vertical_spacing=0.02,
#subplot_titles=("NSW AirBnB Listing, Vacancy V.S. Confirmed Case, Google trends <I>Covid<I>")
)
# Add traces
fig.add_trace(
go.Bar(x=df_property_availability['date'], y=df_property_availability['property_number'], name="Listed properties"),
secondary_y=False,
row=1,
col=1
)
fig.add_trace(
go.Bar(x=df_property_availability['date'], y=df_property_availability['available'], name="Vacant (~available) properties of AirBnb in NSW"),
secondary_y=False,
row=1,
col=1
)
fig.add_trace(
go.Scatter(x=df_property_availability['date'], y=df_property_availability['available_ratio'], name="Vacancy (~available) rate of AirBnb in NSW"),
secondary_y=True,
row=1,
col=1
)
# Set x-axis title
#fig.update_xaxes(title_text="Sampling Date", row=1, col=1)
# Set y-axes titles
fig.update_yaxes(title_text="Property Number (log)", type='log', secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Vacancy (~Available) Rate", range=[0, 0.5], secondary_y=True, row=1, col=1)
# Add traces
fig.add_trace(
go.Bar(x=df_nsw_cases_count['notification_date'], y=df_nsw_cases_count['counts'], name="Confirmed cases in NSW"),
secondary_y=False,
row=2,
col=1
)
fig.add_trace(
go.Scatter(x=df_trends['Week'], y=df_trends['covid: (Australia)'], name="Australian users' interest in <I>Covid</I> (Google Trends)"),
secondary_y=True,
row=2,
col=1
)
# Set x-axis title
#fig.update_xaxes(title_text="Date", row=2, col=1)
# Set y-axes titles
fig.update_yaxes(title_text="Confirmed case (log)", type='log', secondary_y=False, row=2, col=1)
fig.update_yaxes(title_text="Australian users' interest in <I>Covid</I> on Google", secondary_y=True, row=2, col=1)
# Add figure title
fig.update_layout(
title_text="NSW AirBnB Listing, Vacancy V.S. Confirmed Case, Google trends <I>Covid</I>",
height=800
)
# Set x-axis title
#fig.update_xaxes(title_text="Date")
# Set y-axes titles
#fig.update_yaxes(title_text="NSW Google search term Covid", type='log', secondary_y=True)
#fig.update_yaxes(title_text="NSW confirmed case number (Logorithm)", type='log', secondary_y=False)
fig.show()
An initial and preliminary analysis of the diagram suggests:
TBD
TBD
df_listings_summary_20211106.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11156 | An Oasis in the City | 40855 | Colleen | NaN | Sydney | -33.86767 | 151.22497 | Private room | 65 | 90 | 196 | 2020-03-05 | 2.02 | 1 | 364 | 0 | NaN |
| 1 | 14250 | Manly Harbour House | 55948 | Heidi | NaN | Manly | -33.80084 | 151.26378 | Entire home/apt | 470 | 2 | 3 | 2021-06-01 | 0.04 | 2 | 129 | 1 | Exempt |
| 2 | 15253 | Unique Designer Rooftop Apartment in City Loca... | 59850 | Morag | NaN | Sydney | -33.87964 | 151.21680 | Private room | 79 | 2 | 377 | 2021-04-19 | 3.54 | 1 | 342 | 13 | PID-STRA-24061-7 |
| 3 | 44545 | Sunny Darlinghurst Warehouse Apartment | 112237 | Atari | NaN | Sydney | -33.87888 | 151.21439 | Entire home/apt | 130 | 90 | 76 | 2019-08-20 | 1.00 | 1 | 0 | 0 | NaN |
| 4 | 58506 | Studio Yindi @ Mosman, Sydney | 279955 | John | NaN | Mosman | -33.81748 | 151.23484 | Entire home/apt | 140 | 2 | 293 | 2021-03-22 | 3.13 | 1 | 311 | 11 | PID-STRA-6093 |
#df_listings_summary_20211106.describe(include='all')
df_listings_summary_20211106['id'].nunique()
20513
df_listings_summary_20211106.query("availability_365 == 365").nunique()
id 769 name 757 host_id 595 host_name 493 neighbourhood_group 0 neighbourhood 37 latitude 742 longitude 740 room_type 4 price 226 minimum_nights 26 number_of_reviews 71 last_review 342 reviews_per_month 125 calculated_host_listings_count 21 availability_365 1 number_of_reviews_ltm 24 license 68 dtype: int64
df_listings_summary_20211106.query("availability_365 == 364").nunique()
id 411 name 407 host_id 315 host_name 289 neighbourhood_group 0 neighbourhood 36 latitude 391 longitude 396 room_type 4 price 165 minimum_nights 21 number_of_reviews 64 last_review 239 reviews_per_month 103 calculated_host_listings_count 21 availability_365 1 number_of_reviews_ltm 24 license 61 dtype: int64
for k, v in df_bnb_listings_detailed.iloc[0].iteritems():
print(k,'\n' ,v)
id 11156 listing_url https://www.airbnb.com/rooms/11156 scrape_id 20211106153124 last_scraped 2021-11-07 name An Oasis in the City description Very central to the city which can be reached by an easy walk or by bus, with transport at the door, if required, and all amenities within easy reach.<br /><br /><b>The space</b><br />Potts Pt. is a vibrant and popular inner-city suburb & the area was described as 'Australia's Bohemian Heart' in the article by Raymond Bonner in the New York Times . Ultra-convenient & within walking distance of the central business district & all tourist sites such as: <br />the Botanic Gardens, <br />Opera House, <br />NSW Art Gallery, <br />Australian Museum, <br />Circular Quay, the Sydney Opera House, and BridgeClimb<br />Darling Harbour<br />Museum of Contemporary Art<br />Art Gallery of NSW<br />Sydney Museum<br />Hyde Park<br />Botanic Gardens<br />Chinatown, Paddy's Markets, and the Chinese Gardens<br />Paddington (& the Saturday markets)<br />The historic Rocks area (& the markets on Saturday & Sunday)<br /><br />It is 7 minutes walk to the Kings Cross.train (Metro) station & buses to the be neighborhood_overview It is very close to everything and everywhere, has many trees along the streets and is in a beautiful part of the city picture_url https://a0.muscache.com/pictures/2797669/17895d03_original.jpg host_id 40855 host_url https://www.airbnb.com/users/show/40855 host_name Colleen host_since 2009-09-23 host_location Potts Point, New South Wales, Australia host_about Recently retired, I've lived & worked on 4 continents & over the years have travelled thru about 50 countries. I live in inner-Sydney within walking distance of most places (except the beaches, which are approx. 30 minutes. away on public transport). Interests are travelling, wildlife, reading,snorkelling, theatre, ballet & interesting food. host_response_time nan host_response_rate nan host_acceptance_rate nan host_is_superhost f host_thumbnail_url https://a0.muscache.com/im/users/40855/profile_pic/1259121939/original.jpg?aki_policy=profile_small host_picture_url https://a0.muscache.com/im/users/40855/profile_pic/1259121939/original.jpg?aki_policy=profile_x_medium host_neighbourhood Potts Point host_listings_count 1.0 host_total_listings_count 1.0 host_verifications ['email', 'phone', 'reviews'] host_has_profile_pic t host_identity_verified f neighbourhood Potts Point, New South Wales, Australia neighbourhood_cleansed Sydney neighbourhood_group_cleansed nan latitude -33.86767 longitude 151.22497 property_type Private room in rental unit room_type Private room accommodates 1 bathrooms nan bathrooms_text 1 shared bath bedrooms 1.0 beds nan amenities ["TV", "Kitchen", "Refrigerator", "Extra pillows and blankets", "Cooking basics", "Stove", "Washer", "Dishwasher", "Hot water", "Dishes and silverware", "Essentials", "Backyard", "Oven", "Long term stays allowed", "Elevator", "Smoke alarm", "Single level home", "Hangers", "Shower gel", "Fire extinguisher", "Bed linens", "Wifi", "Microwave", "Hair dryer", "Shampoo", "Heating", "Iron", "Patio or balcony"] price $65.00 minimum_nights 90 maximum_nights 180 minimum_minimum_nights 90 maximum_minimum_nights 90 minimum_maximum_nights 180 maximum_maximum_nights 180 minimum_nights_avg_ntm 90.0 maximum_nights_avg_ntm 180.0 calendar_updated nan has_availability t availability_30 29 availability_60 59 availability_90 89 availability_365 364 calendar_last_scraped 2021-11-07 number_of_reviews 196 number_of_reviews_ltm 0 number_of_reviews_l30d 0 first_review 2013-11-12 last_review 2020-03-05 review_scores_rating 4.6 review_scores_accuracy 4.76 review_scores_cleanliness 4.31 review_scores_checkin 4.8 review_scores_communication 4.77 review_scores_location 4.82 review_scores_value 4.71 license nan instant_bookable f calculated_host_listings_count 1 calculated_host_listings_count_entire_homes 0 calculated_host_listings_count_private_rooms 1 calculated_host_listings_count_shared_rooms 0 reviews_per_month 2.02
df_bnb_listings_detailed.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20513 entries, 0 to 20512 Data columns (total 74 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 20513 non-null int64 1 listing_url 20513 non-null object 2 scrape_id 20513 non-null int64 3 last_scraped 20513 non-null object 4 name 20506 non-null object 5 description 19741 non-null object 6 neighborhood_overview 12365 non-null object 7 picture_url 20513 non-null object 8 host_id 20513 non-null int64 9 host_url 20513 non-null object 10 host_name 20508 non-null object 11 host_since 20508 non-null object 12 host_location 20493 non-null object 13 host_about 10978 non-null object 14 host_response_time 7217 non-null object 15 host_response_rate 7217 non-null object 16 host_acceptance_rate 8565 non-null object 17 host_is_superhost 20508 non-null object 18 host_thumbnail_url 20508 non-null object 19 host_picture_url 20508 non-null object 20 host_neighbourhood 11738 non-null object 21 host_listings_count 20508 non-null float64 22 host_total_listings_count 20508 non-null float64 23 host_verifications 20513 non-null object 24 host_has_profile_pic 20508 non-null object 25 host_identity_verified 20508 non-null object 26 neighbourhood 12367 non-null object 27 neighbourhood_cleansed 20513 non-null object 28 neighbourhood_group_cleansed 0 non-null float64 29 latitude 20513 non-null float64 30 longitude 20513 non-null float64 31 property_type 20513 non-null object 32 room_type 20513 non-null object 33 accommodates 20513 non-null int64 34 bathrooms 0 non-null float64 35 bathrooms_text 20489 non-null object 36 bedrooms 19078 non-null float64 37 beds 19646 non-null float64 38 amenities 20513 non-null object 39 price 20513 non-null object 40 minimum_nights 20513 non-null int64 41 maximum_nights 20513 non-null int64 42 minimum_minimum_nights 20513 non-null int64 43 maximum_minimum_nights 20513 non-null int64 44 minimum_maximum_nights 20513 non-null int64 45 maximum_maximum_nights 20513 non-null int64 46 minimum_nights_avg_ntm 20513 non-null float64 47 maximum_nights_avg_ntm 20513 non-null float64 48 calendar_updated 0 non-null float64 49 has_availability 20513 non-null object 50 availability_30 20513 non-null int64 51 availability_60 20513 non-null int64 52 availability_90 20513 non-null int64 53 availability_365 20513 non-null int64 54 calendar_last_scraped 20513 non-null object 55 number_of_reviews 20513 non-null int64 56 number_of_reviews_ltm 20513 non-null int64 57 number_of_reviews_l30d 20513 non-null int64 58 first_review 14878 non-null object 59 last_review 14878 non-null object 60 review_scores_rating 14878 non-null float64 61 review_scores_accuracy 14260 non-null float64 62 review_scores_cleanliness 14270 non-null float64 63 review_scores_checkin 14253 non-null float64 64 review_scores_communication 14270 non-null float64 65 review_scores_location 14254 non-null float64 66 review_scores_value 14249 non-null float64 67 license 6761 non-null object 68 instant_bookable 20513 non-null object 69 calculated_host_listings_count 20513 non-null int64 70 calculated_host_listings_count_entire_homes 20513 non-null int64 71 calculated_host_listings_count_private_rooms 20513 non-null int64 72 calculated_host_listings_count_shared_rooms 20513 non-null int64 73 reviews_per_month 14878 non-null float64 dtypes: float64(19), int64(21), object(34) memory usage: 11.6+ MB
df_bnb_cal_detailed.query("listing_id==11156")
| listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
|---|---|---|---|---|---|---|---|
| 4130 | 11156 | 2021-11-07 | f | $65.00 | $65.00 | 90.0 | 180.0 |
| 4746 | 11156 | 2021-11-08 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 4747 | 11156 | 2021-11-09 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 4748 | 11156 | 2021-11-10 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 4749 | 11156 | 2021-11-11 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 5416 | 11156 | 2022-11-02 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 5417 | 11156 | 2022-11-03 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 5418 | 11156 | 2022-11-04 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 5419 | 11156 | 2022-11-05 | t | $65.00 | $65.00 | 90.0 | 180.0 |
| 5420 | 11156 | 2022-11-06 | t | $65.00 | $65.00 | 90.0 | 180.0 |
365 rows × 7 columns
df_bnb_cal_detailed.iloc[0]
listing_id 837264 date 2021-11-07 00:00:00 available f price $40.00 adjusted_price $40.00 minimum_nights 90.0 maximum_nights 99.0 Name: 0, dtype: object
df_listing_neighbourhood = df_listings_summary_20211106.groupby('neighbourhood')['id', 'host_id'].nunique()
df_listing_neighbourhood.reset_index(inplace=True)
#df_listing_neighbourhood
<ipython-input-6-ddd5c519601f>:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
df_listing_neighbourhood = df_listings_summary_20211106.groupby('neighbourhood')['id', 'host_id'].nunique()
#fig, ax = plt.subplots(figsize=(18, 8))
#plt.xticks(rotation=75)
#ax = sns.barplot(data=df_listing_neighbourhood,
# x='neighbourhood', y='id')
fig = px.bar(df_listing_neighbourhood,
x='neighbourhood',
y='id',
hover_data=['id', 'host_id'],
labels={'id': 'number of properties', 'host_id': 'number of hosts'},
title='Number of properties in neighbourhoods',
color='id',
height=800
)
fig.show()
syd_geojson = gpd.read_file('./data_airbnb_syd/20211106/neighbourhoods.geojson')
#syd_geojson
syd_geojson_joined = syd_geojson.merge(df_listing_neighbourhood, on='neighbourhood')
#syd_geojson_joined
#fig, axes = plt.subplots(2, figsize=(40, 25))
#syd_geojson_joined.plot(column='id',
# ax=axes[0],
# legend=True,
# legend_kwds={'label': 'Number of properties', 'orientation': 'vertical'})
#
#syd_geojson_joined.plot(column='host_id',
# ax=axes[1],
# legend=True,
# legend_kwds={'label': 'Number of hosts', 'orientation': 'vertical'})
syd_geojson_joined.boundary.plot()
<AxesSubplot:>
syd_geojson_joined.centroid.plot()
<ipython-input-115-c3974a4a55f4>:1: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
<AxesSubplot:>
#syd_geojson_joined.explore(column='id',
# popup=['neighbourhood', 'id', 'host_id'],
# popup_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# tooltip=['neighbourhood', 'id', 'host_id'],
# tooltip_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# legend=True,
# legend_kwds={'caption': 'Number of properties'})
#syd_geojson_joined.explore(column='host_id',
# popup=['neighbourhood', 'id', 'host_id'],
# popup_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# tooltip=['neighbourhood', 'id', 'host_id'],
# tooltip_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# legend=True,
# legend_kwds={'caption': 'Number of hosts'},
# k=2)
gpd.tools.geocode(syd_geojson['neighbourhood'])
| geometry | address | |
|---|---|---|
| 0 | POINT (151.29989 -33.62316) | Pittwater, Australia |
| 1 | POINT (151.24403 -33.94552) | Randwick City Council, New South Wales, Australia |
| 2 | POINT (151.27693 -33.88393) | Waverley Council, New South Wales, Australia |
| 3 | POINT (145.02952 -24.80982) | Warringah, Blackall, Queensland, Australia |
| 4 | POINT (151.16972 -33.82320) | Lane Cove Municipal Council, 2066, New South W... |
| 5 | POINT (151.20920 -33.83432) | North Sydney Council, New South Wales, Australia |
| 6 | POINT (151.06568 -33.92390) | Canterbury-Bankstown Council, New South Wales,... |
| 7 | POINT (171.80984 -43.49418) | Canterbury, New Zealand / Aotearoa |
| 8 | POINT (151.07852 -33.87982) | Strathfield Municipal Council, New South Wales... |
| 9 | POINT (-122.23018 47.30754) | Auburn, Washington, United States |
| 10 | POINT (150.97102 -33.57531) | The Hills Shire Council, New South Wales, Aust... |
| 11 | POINT (151.13603 -33.82987) | The Council of the Municipality of Hunters Hil... |
| 12 | POINT (151.21645 -33.85482) | Sydney, New South Wales, Australia |
| 13 | POINT (-2.61760 53.66557) | Botany Bay Boatyard, United Kingdom |
| 14 | POINT (-90.68279 42.09749) | Hurstville, 52060, Iowa, United States |
| 15 | POINT (150.88931 -33.86744) | Fairfield City Council, New South Wales, Austr... |
| 16 | POINT (150.86484 -33.74019) | Blacktown City Council, New South Wales, Austr... |
| 17 | POINT (123.33315 -26.85927) | Holroyd Bluff, Australia |
| 18 | POINT (-1.25188 53.08977) | Ashfield, England, United Kingdom |
| 19 | POINT (151.15569 -33.91039) | Marrickville, 2204, Inner West Council, New So... |
| 20 | POINT (151.04930 -34.07173) | Sutherland Shire Council, New South Wales, Aus... |
| 21 | POINT (-84.03221 33.64303) | Rockdale County, Georgia, United States |
| 22 | POINT (147.24979 -42.81287) | Kogarah Avenue, 7011, City of Glenorchy, Tasma... |
| 23 | POINT (151.02136 -33.80697) | City of Parramatta Council, New South Wales, A... |
| 24 | POINT (151.10462 -33.80158) | Council of the City of Ryde, New South Wales, ... |
| 25 | POINT (147.10505 -20.21759) | Leichhardt Range, Australia |
| 26 | POINT (151.13178 -33.57241) | The Council of the Shire of Hornsby, New South... |
| 27 | POINT (151.14921 -33.72713) | Ku-ring-gai Council, New South Wales, Australia |
| 28 | POINT (151.18365 -33.80160) | Willoughby City Council, New South Wales, Aust... |
| 29 | POINT (150.89884 -33.95259) | Liverpool City Council, New South Wales, Austr... |
| 30 | POINT (150.71479 -33.74780) | Penrith City Council, New South Wales, Australia |
| 31 | POINT (151.09758 -33.84851) | City of Canada Bay Council, New South Wales, A... |
| 32 | POINT (-92.76383 38.03231) | Camden County, Missouri, United States |
| 33 | POINT (150.85366 -34.07379) | Campbelltown City Council, New South Wales, Au... |
| 34 | POINT (151.24762 -33.82854) | Mosman Municipal Council, 2088, New South Wale... |
| 35 | POINT (-93.20215 43.28718) | Manly, 50456, Iowa, United States |
| 36 | POINT (151.27333 -33.86183) | Woollahra Municipal Council, New South Wales, ... |
| 37 | POINT (151.10248 -33.88581) | Burwood Council, New South Wales, Australia |
fig = px.violin(df_listings_summary_20211106,
y="price",
x="neighbourhood",
color="neighbourhood",
box=True,
points='outliers',#points="all",
hover_data=['neighbourhood', 'price'],
#width=2000,
#height=2000,
title='Price in neighbourhoods'
)
fig.show()
df_listing_20211106_price = df_listings_summary_20211106.groupby('neighbourhood')['price'].agg([np.min, np.max, np.mean, np.median])
df_listing_20211106_price = df_listing_20211106_price.reset_index()
df_listing_20211106_price
| neighbourhood | amin | amax | mean | median | |
|---|---|---|---|---|---|
| 0 | Ashfield | 25 | 582 | 105.390244 | 80.0 |
| 1 | Auburn | 19 | 1624 | 136.118110 | 109.0 |
| 2 | Bankstown | 18 | 600 | 94.061224 | 70.0 |
| 3 | Blacktown | 18 | 900 | 91.371585 | 70.0 |
| 4 | Botany Bay | 17 | 1000 | 107.890026 | 80.0 |
| 5 | Burwood | 17 | 530 | 103.907609 | 75.0 |
| 6 | Camden | 40 | 389 | 121.512821 | 110.0 |
| 7 | Campbelltown | 28 | 400 | 106.515625 | 71.0 |
| 8 | Canada Bay | 29 | 1073 | 147.028807 | 110.0 |
| 9 | Canterbury | 13 | 1000 | 106.519048 | 80.0 |
| 10 | City Of Kogarah | 25 | 1750 | 129.515625 | 70.0 |
| 11 | Fairfield | 20 | 840 | 103.250000 | 85.0 |
| 12 | Holroyd | 22 | 339 | 108.125000 | 72.5 |
| 13 | Hornsby | 20 | 1466 | 207.228013 | 111.0 |
| 14 | Hunters Hill | 40 | 1600 | 248.692308 | 132.5 |
| 15 | Hurstville | 25 | 997 | 113.492063 | 76.0 |
| 16 | Ku-Ring-Gai | 20 | 1357 | 163.934343 | 108.5 |
| 17 | Lane Cove | 25 | 1800 | 218.403846 | 122.5 |
| 18 | Leichhardt | 30 | 1500 | 204.652083 | 150.0 |
| 19 | Liverpool | 24 | 514 | 123.197802 | 90.0 |
| 20 | Manly | 30 | 5000 | 355.029562 | 240.0 |
| 21 | Marrickville | 20 | 5000 | 127.655502 | 94.0 |
| 22 | Mosman | 36 | 4000 | 473.503704 | 257.5 |
| 23 | North Sydney | 19 | 2448 | 202.104698 | 143.0 |
| 24 | Parramatta | 20 | 600 | 105.693878 | 83.0 |
| 25 | Penrith | 35 | 1990 | 183.258065 | 114.0 |
| 26 | Pittwater | 26 | 10000 | 731.081212 | 460.0 |
| 27 | Randwick | 19 | 6165 | 195.104741 | 114.0 |
| 28 | Rockdale | 17 | 700 | 105.376016 | 80.0 |
| 29 | Ryde | 22 | 7429 | 126.357333 | 87.0 |
| 30 | Strathfield | 21 | 1999 | 125.058824 | 70.0 |
| 31 | Sutherland Shire | 30 | 1764 | 246.680000 | 161.0 |
| 32 | Sydney | 18 | 12000 | 184.779798 | 128.0 |
| 33 | The Hills Shire | 20 | 5897 | 247.757576 | 79.5 |
| 34 | Warringah | 25 | 6758 | 321.448756 | 189.0 |
| 35 | Waverley | 22 | 10000 | 265.547679 | 160.0 |
| 36 | Willoughby | 26 | 14568 | 239.201299 | 111.0 |
| 37 | Woollahra | 25 | 28613 | 402.283784 | 173.0 |
df_listing_20211106_price.iloc[0]
neighbourhood Ashfield amin 25 amax 582 mean 105.390244 median 80.0 Name: 0, dtype: object
#fig = px.choropleth(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='median',
# color_continuous_scale='Viridis',
# hover_name='neighbourhood',
# hover_data=['median', 'mean']
# )
#fig.update_layout(margin={"r":0, "t":0, "l": 0, "b": 0})
#fig.update_geos(fitbounds="locations")
#fig.show()
#dir(syd_geojson.centroid[0])
print(syd_geojson.centroid[0].coords[0])
## px.choropleth_mapbox does not calculate center and zoom automatically.
## randomly select one point within relevant area as the center of map.
#center_coords = syd_geojson.centroid[0].coords[0] # note: it is in format (point.x, point.y), so (longtitude, latitude)
#
#fig = px.choropleth_mapbox(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='median',
# color_continuous_scale='Viridis',
# hover_name='neighbourhood',
# hover_data=['median', 'mean'],
# mapbox_style="open-street-map",
# center={'lat': center_coords[1], 'lon': center_coords[0]}
# )
#fig.update_layout(margin={"r":0, "t":0, "l": 0, "b": 0})
#fig.update_geos(fitbounds="locations", visible=False) ## it is not working on px.choropleth_mapbox. need to set center manually
#
#fig.show()
#center_coords = syd_geojson.centroid[0].coords[0]
#
#fig = px.choropleth_mapbox(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='amax',
# color_continuous_scale="Viridis",
# hover_name='neighbourhood',
# hover_data=['amax', 'amin'],
# center={'lat': center_coords[1], 'lon': center_coords[0]}
# )
#fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
#fig.update_layout(mapbox_style="open-street-map")
#fig.update_geos(fitbounds="locations", visible=False)## it is not working on px.choropleth_mapbox. need to set center manually
# Note: if you put fig.update_layout() below instead of above (before fig.update_geos),
# no data will be shown on map.
#fig.update_layout(mapbox_style="open-street-map")
#fig.show()
import os
df_list = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
file_path = os.path.join(folder, 'listings.csv')
df_temp = pd.read_csv(file_path, header='infer')
df_temp = df_temp.groupby('neighbourhood')['price'].agg([np.min, np.max, np.mean, np.median])
df_temp = df_temp.reset_index()
df_temp = df_temp.assign(listing_date=date_label)
df_list.append(df_temp)
df_pricing = pd.concat(df_list)
df_pricing
| neighbourhood | amin | amax | mean | median | listing_date | |
|---|---|---|---|---|---|---|
| 0 | Ashfield | 20 | 9021 | 148.242537 | 82.5 | 20201214 |
| 1 | Auburn | 19 | 13248 | 158.780220 | 100.0 | 20201214 |
| 2 | Bankstown | 18 | 600 | 96.950000 | 76.5 | 20201214 |
| 3 | Blacktown | 22 | 2000 | 92.877193 | 61.0 | 20201214 |
| 4 | Botany Bay | 15 | 2904 | 121.161600 | 80.0 | 20201214 |
| ... | ... | ... | ... | ... | ... | ... |
| 33 | The Hills Shire | 20 | 5897 | 247.757576 | 79.5 | 20211106 |
| 34 | Warringah | 25 | 6758 | 321.448756 | 189.0 | 20211106 |
| 35 | Waverley | 22 | 10000 | 265.547679 | 160.0 | 20211106 |
| 36 | Willoughby | 26 | 14568 | 239.201299 | 111.0 | 20211106 |
| 37 | Woollahra | 25 | 28613 | 402.283784 | 173.0 | 20211106 |
418 rows × 6 columns
df_pricing['listing_date_form'] = pd.to_datetime(df_pricing['listing_date'])
df_pricing['listing_date_form'] = df_pricing['listing_date_form'].astype(str)
df_pricing.sort_values('listing_date_form')
df_pricing
| neighbourhood | amin | amax | mean | median | listing_date | listing_date_form | |
|---|---|---|---|---|---|---|---|
| 0 | Ashfield | 20 | 9021 | 148.242537 | 82.5 | 20201214 | 2020-12-14 |
| 1 | Auburn | 19 | 13248 | 158.780220 | 100.0 | 20201214 | 2020-12-14 |
| 2 | Bankstown | 18 | 600 | 96.950000 | 76.5 | 20201214 | 2020-12-14 |
| 3 | Blacktown | 22 | 2000 | 92.877193 | 61.0 | 20201214 | 2020-12-14 |
| 4 | Botany Bay | 15 | 2904 | 121.161600 | 80.0 | 20201214 | 2020-12-14 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 33 | The Hills Shire | 20 | 5897 | 247.757576 | 79.5 | 20211106 | 2021-11-06 |
| 34 | Warringah | 25 | 6758 | 321.448756 | 189.0 | 20211106 | 2021-11-06 |
| 35 | Waverley | 22 | 10000 | 265.547679 | 160.0 | 20211106 | 2021-11-06 |
| 36 | Willoughby | 26 | 14568 | 239.201299 | 111.0 | 20211106 | 2021-11-06 |
| 37 | Woollahra | 25 | 28613 | 402.283784 | 173.0 | 20211106 | 2021-11-06 |
418 rows × 7 columns
df_pricing.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 418 entries, 0 to 37 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 neighbourhood 418 non-null object 1 amin 418 non-null int64 2 amax 418 non-null int64 3 mean 418 non-null float64 4 median 418 non-null float64 5 listing_date 418 non-null object 6 listing_date_form 418 non-null object dtypes: float64(2), int64(2), object(3) memory usage: 26.1+ KB
fig = px.bar(df_pricing,
x="neighbourhood",
y="median",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[0,600])
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
fig = px.bar(df_pricing,
x="neighbourhood",
y="amin",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[0,100])
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
fig = px.bar(df_pricing,
x="neighbourhood",
y="amax",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[1,100000],
log_y=True
)
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
df_listings_summary_20201214 = pd.read_csv("./data_airbnb_syd/20201214/listings.csv", header='infer')
df_listings_summary_20201214['id'].unique().shape
print('number of properties {}'.format(df_listings_summary_20201214['id'].unique().shape[0]))
print('number of hosts {}'.format(df_listings_summary_20201214['host_id'].unique().shape[0]))
import os
property_numbers = []
host_numbers = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
file_path = os.path.join(folder, 'listings.csv')
df_temp = pd.read_csv(file_path, header='infer')
property_numbers.append((date_label, df_temp['id'].unique().shape[0]))
host_numbers.append((date_label, df_temp['host_id'].unique().shape[0]))
property_numbers
host_numbers
df_property_host_numbers = pd.DataFrame({'date': [d for d, _ in property_numbers],
'property_number': [n for _, n in property_numbers],
'host_number': [n for _, n in host_numbers]})
df_property_host_numbers
| date | property_number | host_number | |
|---|---|---|---|
| 0 | 20201214 | 33871 | 25321 |
| 1 | 20210110 | 33902 | 25097 |
| 2 | 20210208 | 33630 | 24965 |
| 3 | 20210304 | 33229 | 24759 |
| 4 | 20210410 | 33229 | 24759 |
| 5 | 20210605 | 32079 | 24055 |
| 6 | 20210705 | 31899 | 23913 |
| 7 | 20210807 | 31764 | 23799 |
| 8 | 20210908 | 31030 | 23467 |
| 9 | 20211009 | 30223 | 22980 |
| 10 | 20211106 | 20513 | 14977 |
#import matplotlib.pyplot as plt
#plt.xticks(rotation=45)
#plt.title('Property and Host numbers 2020 Dec - 2021 Nov')
#
#plt.plot(df_property_host_numbers['date'], df_property_host_numbers['property_number'], label='Property Number')
#plt.plot(df_property_host_numbers['date'], df_property_host_numbers['host_number'], label='Host Number')
#
#plt.legend()
#sns.relplot(data=df_property_host_numbers, kind='line')
#pd.melt(df_property_host_numbers,
# id_vars=['date'],
# value_vars=['property_number', 'host_number'],
# var_name='number',
# )
#sns.relplot(data=pd.melt(df_property_host_numbers,
# id_vars=['date'],
# value_vars=['property_number', 'host_number'],
# var_name='type',
# value_name='number'),
# x='date',
# y='number',
# hue='type',
# kind='line')
fig, ax = plt.subplots()
plt.title('Property and Host numbers 2020 Dec - 2021 Nov')
plt.xticks(rotation=45)
ax = sns.lineplot(data=df_property_host_numbers, x='date', y='property_number', label='Property Number')
ax1 = sns.lineplot(data=df_property_host_numbers, x='date', y='host_number', label='Host Number')
plt.xlabel('date')
plt.ylabel('number')
plt.legend()
<matplotlib.legend.Legend at 0x7f89a82e79a0>
df_listings_detailed_20211106 = pd.read_csv("./data_airbnb_syd/20211106/listings_detailed.csv", header='infer')
df_listings_detailed_20211106.head()
| id | listing_url | scrape_id | last_scraped | name | description | neighborhood_overview | picture_url | host_id | host_url | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11156 | https://www.airbnb.com/rooms/11156 | 20211106153124 | 2021-11-07 | An Oasis in the City | Very central to the city which can be reached ... | It is very close to everything and everywhere,... | https://a0.muscache.com/pictures/2797669/17895... | 40855 | https://www.airbnb.com/users/show/40855 | ... | 4.77 | 4.82 | 4.71 | NaN | f | 1 | 0 | 1 | 0 | 2.02 |
| 1 | 14250 | https://www.airbnb.com/rooms/14250 | 20211106153124 | 2021-11-07 | Manly Harbour House | Beautifully renovated, spacious and quiet, our... | Balgowlah Heights is one of the most prestigio... | https://a0.muscache.com/pictures/56935671/fdb8... | 55948 | https://www.airbnb.com/users/show/55948 | ... | 4.33 | 4.67 | 4.33 | Exempt | f | 2 | 2 | 0 | 0 | 0.04 |
| 2 | 15253 | https://www.airbnb.com/rooms/15253 | 20211106153124 | 2021-11-07 | Unique Designer Rooftop Apartment in City Loca... | Penthouse living at it best ... You will be st... | The location is really central and there is nu... | https://a0.muscache.com/pictures/46dcb8a1-5d5b... | 59850 | https://www.airbnb.com/users/show/59850 | ... | 4.67 | 4.70 | 4.47 | PID-STRA-24061-7 | t | 1 | 0 | 1 | 0 | 3.54 |
| 3 | 44545 | https://www.airbnb.com/rooms/44545 | 20211106153124 | 2021-11-07 | Sunny Darlinghurst Warehouse Apartment | Sunny warehouse/loft apartment in the heart of... | Darlinghurst is home to some of Sydney's best ... | https://a0.muscache.com/pictures/a88d8e14-4f63... | 112237 | https://www.airbnb.com/users/show/112237 | ... | 4.96 | 4.93 | 4.77 | NaN | f | 1 | 1 | 0 | 0 | 1.00 |
| 4 | 58506 | https://www.airbnb.com/rooms/58506 | 20211106153124 | 2021-11-07 | Studio Yindi @ Mosman, Sydney | An open plan apartment, which opens onto a spa... | Mosman is a smart, middle to upper class subur... | https://a0.muscache.com/pictures/23497720/d30f... | 279955 | https://www.airbnb.com/users/show/279955 | ... | 4.90 | 4.72 | 4.67 | PID-STRA-6093 | f | 1 | 1 | 0 | 0 | 3.13 |
5 rows × 74 columns
df_listings_detailed_20211106.info()
df_listings_detailed_20211106.iloc[0]
df_listings_detailed_20211106['calendar_last_scraped'].unique()
array(['2021-11-07', '2021-11-06'], dtype=object)
df_neighbourhoods_csv = pd.read_csv('./data_airbnb_syd/20210110/neighbourhoods.csv', header='infer')
df_neighbourhoods_csv.head()
| neighbourhood_group | neighbourhood | |
|---|---|---|
| 0 | NaN | Ashfield |
| 1 | NaN | Auburn |
| 2 | NaN | Bankstown |
| 3 | NaN | Blacktown |
| 4 | NaN | Botany Bay |
df_neighbourhoods_csv.count()
neighbourhood_group 0 neighbourhood 38 dtype: int64
df_neighbourhoods_csv
| neighbourhood_group | neighbourhood | |
|---|---|---|
| 0 | NaN | Ashfield |
| 1 | NaN | Auburn |
| 2 | NaN | Bankstown |
| 3 | NaN | Blacktown |
| 4 | NaN | Botany Bay |
| 5 | NaN | Burwood |
| 6 | NaN | Camden |
| 7 | NaN | Campbelltown |
| 8 | NaN | Canada Bay |
| 9 | NaN | Canterbury |
| 10 | NaN | City Of Kogarah |
| 11 | NaN | Fairfield |
| 12 | NaN | Holroyd |
| 13 | NaN | Hornsby |
| 14 | NaN | Hunters Hill |
| 15 | NaN | Hurstville |
| 16 | NaN | Ku-Ring-Gai |
| 17 | NaN | Lane Cove |
| 18 | NaN | Leichhardt |
| 19 | NaN | Liverpool |
| 20 | NaN | Manly |
| 21 | NaN | Marrickville |
| 22 | NaN | Mosman |
| 23 | NaN | North Sydney |
| 24 | NaN | Parramatta |
| 25 | NaN | Penrith |
| 26 | NaN | Pittwater |
| 27 | NaN | Randwick |
| 28 | NaN | Rockdale |
| 29 | NaN | Ryde |
| 30 | NaN | Strathfield |
| 31 | NaN | Sutherland Shire |
| 32 | NaN | Sydney |
| 33 | NaN | The Hills Shire |
| 34 | NaN | Warringah |
| 35 | NaN | Waverley |
| 36 | NaN | Willoughby |
| 37 | NaN | Woollahra |
df_neighbourhoods_geo = pd.read_json('./data_airbnb_syd/20210110/neighbourhoods.geojson')
df_neighbourhoods_geo.head()
| type | features | |
|---|---|---|
| 0 | FeatureCollection | {'type': 'Feature', 'geometry': {'type': 'Mult... |
| 1 | FeatureCollection | {'type': 'Feature', 'geometry': {'type': 'Mult... |
| 2 | FeatureCollection | {'type': 'Feature', 'geometry': {'type': 'Mult... |
| 3 | FeatureCollection | {'type': 'Feature', 'geometry': {'type': 'Mult... |
| 4 | FeatureCollection | {'type': 'Feature', 'geometry': {'type': 'Mult... |
print(df_neighbourhoods_geo.iloc[0])
print(df_neighbourhoods_geo.iloc[0]['features'])